We’ll be working with a mental health dataset and will be conducting exploratory data analysis, unsupervised clustering, principal component analysis, gradient boosting, and support vector machines.
To begin, the following code will import the data and load the libraries:
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(VIM)
library(corrplot)
library(purrr)
library(scales)
library(caret)
library(Hmisc)
library(naniar)
library(conflicted)
library(pheatmap)
library(corrplot)
library(factoextra)
library(gbm)
library(caret)
library(gridExtra)
# resolve function name conflict
conflict_prefer('filter', 'dplyr')
conflict_prefer('summarize', 'dplyr')
# import data
url <- 'https://raw.githubusercontent.com/SmilodonCub/Data622_group5_projects/main/ADHD_data.csv'
df <- read.csv(url, header=T, na.strings="")In order to facilitate ease of use, we’ll be renaming the columns. Additionally, we’ll convert each of the number coded fields to factors while also including the proper labels.
# convert column names to lowercase
names(df) <- lapply(names(df), tolower)
# replace periods with underscore
names(df) <- str_replace_all(names(df), '\\.', '_')
# rename last column to remove trailing underscore
names(df)[ncol(df)] <- 'psych_meds'
# raw data with renamed columns
df_raw <- df
names(df_raw)## [1] "initial" "age" "sex"
## [4] "race" "adhd_q1" "adhd_q2"
## [7] "adhd_q3" "adhd_q4" "adhd_q5"
## [10] "adhd_q6" "adhd_q7" "adhd_q8"
## [13] "adhd_q9" "adhd_q10" "adhd_q11"
## [16] "adhd_q12" "adhd_q13" "adhd_q14"
## [19] "adhd_q15" "adhd_q16" "adhd_q17"
## [22] "adhd_q18" "adhd_total" "md_q1a"
## [25] "md_q1b" "md_q1c" "md_q1d"
## [28] "md_q1e" "md_q1f" "md_q1g"
## [31] "md_q1h" "md_q1i" "md_q1j"
## [34] "md_q1k" "md_q1l" "md_q1m"
## [37] "md_q2" "md_q3" "md_total"
## [40] "alcohol" "thc" "cocaine"
## [43] "stimulants" "sedative_hypnotics" "opioids"
## [46] "court_order" "education" "hx_of_violence"
## [49] "disorderly_conduct" "suicide" "abuse"
## [52] "non_subst_dx" "subst_dx" "psych_meds"
# Sex
df$sex <- factor(df$sex, levels = c(1,2), labels = c('Male','Female'))
# Race
df$race <- factor(df$race, levels = c(1,2,3,4,5,6), labels = c('White','African American','Hispanic','Asian','Native American','Other or Missing Data'))
# ADHD q1 - q18
adhd_cols <- names(df[,5:22])
df[adhd_cols] <- lapply(df[adhd_cols], factor, levels = c(0,1,2,3,4), labels = c('Never','Rarely','Sometimes','Often','Very Often'))
# Mood Disorder q1a - q2
md_cols <- names(df[,24:37])
df[md_cols] <- lapply(df[md_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Mood Disorder q3
df$md_q3 <- factor(df$md_q3, levels = c(0,1,2,3), labels = c('No Problem','Minor','Moderate','Serious'))
# Substance Abuse
sa_cols <- names(df[,40:45])
df[sa_cols] <- lapply(df[sa_cols], factor, levels = c(0,1,2,3), labels = c('No Use','Use','Abuse','Dependence'))
# Court Order
df$court_order <- factor(df$court_order, levels = c(0,1), labels = c('No','Yes'))
# Education
# think it might be okay to leave this as a number
# History of Violence, Disorderly Conduct, Suicide Attempt
hist_cols <- names(df[,48:50])
df[hist_cols] <- lapply(df[hist_cols], factor, levels = c(0,1), labels = c('No','Yes'))
# Abuse History
df$abuse <- factor(df$abuse, levels = c(0,1,2,3,4,5,6,7),
labels = c('No','Physical','Sexual','Emotional','Physical & Sexual','Physical & Emotional','Sexual & Emotional','Physical, Sexual, & Emotional'))
# Non-Substance Related Drugs
df$non_subst_dx <- factor(df$non_subst_dx, levels = c(0,1,2), labels = c('None','One','More than one'))
# Substance Related Drugs
df$subst_dx <- factor(df$subst_dx, levels = c(0,1,2,3), labels = c('None','One','Two','Three or more'))
# Psychiatric Meds
df$psych_meds <- factor(df$psych_meds, levels = c(0,1,2), labels = c('None','One','More than one'))
# str(df)The following code will quantitatively and visually explore the nature of the dataset.
We begin by describing the dataset features.
Use dplyr’s glimpse() function to take a quick look at the data structure. Followed by Hmisc’s describe() function to return some basic summary statistics about the dataframe features:
# quick look at what the data structure looks like
glimpse(df)## Rows: 175
## Columns: 54
## $ initial <chr> "JA", "LA", "MD", "RD", "RB", "SB", "PK", "RJ", "DJ…
## $ age <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_q1 <fct> Rarely, Often, Sometimes, Often, Very Often, Someti…
## $ adhd_q2 <fct> Rarely, Often, Rarely, Often, Very Often, Often, So…
## $ adhd_q3 <fct> Very Often, Very Often, Sometimes, Sometimes, Somet…
## $ adhd_q4 <fct> Sometimes, Very Often, Rarely, Sometimes, Very Ofte…
## $ adhd_q5 <fct> Often, NA, Often, Very Often, Very Often, Often, Ve…
## $ adhd_q6 <fct> Rarely, Sometimes, Often, Often, Sometimes, Sometim…
## $ adhd_q7 <fct> Rarely, Sometimes, Often, Sometimes, Often, Often, …
## $ adhd_q8 <fct> Often, Often, Sometimes, Very Often, Very Often, Ve…
## $ adhd_q9 <fct> Sometimes, Sometimes, Never, Very Often, Very Often…
## $ adhd_q10 <fct> Very Often, Very Often, Rarely, Sometimes, Sometime…
## $ adhd_q11 <fct> Sometimes, Rarely, Sometimes, Often, Very Often, Ve…
## $ adhd_q12 <fct> Very Often, Very Often, Never, Rarely, Rarely, Some…
## $ adhd_q13 <fct> Rarely, Sometimes, Sometimes, Often, Often, Very Of…
## $ adhd_q14 <fct> Never, Very Often, Sometimes, Often, Sometimes, Ver…
## $ adhd_q15 <fct> Often, Very Often, Often, Rarely, Rarely, Often, Ve…
## $ adhd_q16 <fct> Rarely, Often, Sometimes, Sometimes, Sometimes, Ver…
## $ adhd_q17 <fct> Often, Rarely, Rarely, Rarely, Rarely, Often, Somet…
## $ adhd_q18 <fct> Very Often, Very Often, Rarely, Sometimes, Rarely, …
## $ adhd_total <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_q1a <fct> Yes, Yes, No, Yes, No, No, Yes, No, Yes, Yes, Yes, …
## $ md_q1b <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ md_q1c <fct> Yes, Yes, No, No, No, No, No, No, No, No, Yes, No, …
## $ md_q1d <fct> Yes, Yes, No, No, Yes, Yes, No, No, Yes, No, Yes, N…
## $ md_q1e <fct> No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes,…
## $ md_q1f <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Ye…
## $ md_q1g <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Ye…
## $ md_q1h <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1i <fct> Yes, Yes, No, Yes, No, Yes, No, No, No, No, Yes, No…
## $ md_q1j <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, No, …
## $ md_q1k <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, Yes, No,…
## $ md_q1l <fct> No, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ md_q1m <fct> Yes, No, No, Yes, No, No, No, No, Yes, Yes, Yes, No…
## $ md_q2 <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
## $ md_q3 <fct> Serious, Serious, Moderate, Serious, Moderate, Seri…
## $ md_total <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol <fct> Use, No Use, No Use, Use, Use, Use, Dependence, No …
## $ thc <fct> Use, No Use, No Use, Use, Use, No Use, Dependence, …
## $ cocaine <fct> Use, No Use, No Use, Use, No Use, No Use, Use, No U…
## $ stimulants <fct> No Use, No Use, No Use, Use, No Use, No Use, Use, N…
## $ sedative_hypnotics <fct> No Use, No Use, No Use, No Use, No Use, No Use, Use…
## $ opioids <fct> No Use, No Use, No Use, No Use, No Use, No Use, No …
## $ court_order <fct> Yes, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ education <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence <fct> No, No, No, No, Yes, No, Yes, Yes, Yes, No, Yes, No…
## $ disorderly_conduct <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ suicide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, No, No, Yes, N…
## $ abuse <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx <fct> More than one, One, More than one, More than one, M…
## $ subst_dx <fct> None, None, None, None, None, None, None, One, None…
## $ psych_meds <fct> More than one, One, One, More than one, None, None,…
From this output, we can summarize each dataset feature as follows:
| Column Numbers | Column Labels | Description |
|---|---|---|
| 1 | initial |
(string) subject’s initials |
| 2 | age |
(numeric) integer values (years) |
| 3 | sex |
(categorical): binary (‘male’ and ‘female’) |
| 4 | race |
(categorical) |
| 5-22 | adhd_q1-adhd_q18 |
(categorical) ordinal values |
| 23 | adhd_total |
(numeric): summary feature derived from adhd_q1-adhd_q18 |
| 24-38 | md_q1a-md_q3 |
(categorical) ordinal values |
| 39 | md_total |
(numeric): summary feature derived from md_q1a-md_q3 |
| 40 | alcohol |
(categorical): ordinal values |
| 41 | thc |
(categorical): ordinal values |
| 42 | cocaine |
(categorical): ordinal values |
| 43 | stimulants |
(categorical): ordinal values |
| 44 | sedative_hypnotics |
(categorical): ordinal values |
| 45 | opioids |
(categorical): ordinal values |
| 46 | court_order |
(categorical): binary (yes/no) |
| 47 | education |
(numeric): interger values (years) |
| 48 | hx_of_violence |
(categorical): binary (yes/no) |
| 49 | disorderly_conduct |
(categorical): binary (yes/no) |
| 50 | suicide |
(categorical): binary (yes/no) |
| 51 | abuse |
(categorical): ordinal values |
| 52 | non_subst_dx |
(categorical): ordinal values |
| 53 | subst_dx |
(categorical): ordinal values |
| 54 | psych_meds |
(categorical): ordinal values |
The columns adhd_q1-adhd_q18 and md_q1a-md_q3 are summarized by the derivative columns adhd_total and md_total respectively. The adhd features correspond to an ADHD self-report survey whereas the md features give responses to a mood disorder self-report survey. For the initial Exploratory Data Analysis, the individual questions responses will be dropped in place of visualizations and summary statistics on the derived columns, adhd_total and md_total. A detailed analysis of the individual survey questions will be taken up in the Principal Components Analysis section.
Next, we visualize the extent of the missing values using the naniar library.
Use naniar’s miss_var_summary() and vis_miss() functions to summarize and visualize the missing values in the features of the dataset:
# return a summary table of the missing data in each column
miss_var_summary(df)## # A tibble: 54 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 psych_meds 118 67.4
## 2 subst_dx 23 13.1
## 3 non_subst_dx 22 12.6
## 4 abuse 14 8
## 5 suicide 13 7.43
## 6 hx_of_violence 11 6.29
## 7 disorderly_conduct 11 6.29
## 8 education 9 5.14
## 9 court_order 5 2.86
## 10 alcohol 4 2.29
## # … with 44 more rows
# visualize the amount of missing data for each feature
vis_miss( df, cluster = TRUE )The figure above shows a grouped view of the missing values in each feature column. Overall, 2.7% of the values are missing from the dataset. The majority of the features have no missing values. Many of the features have relatively few missing values. However, the psych_meds features is missing a considerable portion of the data.
df <- df %>%
select( -c(psych_meds) )Having dropped psych_meds, we can now use the gg_miss_upset() function to look for any remaining patterns of missing values in the data.
gg_miss_upset( df )Both the vis_miss() and gg_miss_upset() visualization are informative, because they reveal some interesting patterns in the missing values for the data set. For instance, once we remove psych_meds most of the remaining rows that are not full, hold multiple missing values from a subset of the features. In other words, the remaining missing values are not randomly dispersed across the data set. Rather, they are concentrated in a subset of the features: disorderly_conduct, suicide, abuse, non_subst_dx and subst_dx. Notably, the responses that correspond to the ADHD and Mood Disorder self-reporting surveys are have no missing values; this knowledge will be helpful downstream. Further handling of missing values will be tailored to the analysis of each of the sections below.
To gain a feel for the data, we will evaluate visualizations of select features.
Summary Statistics of Demographic Variables
# summary of each demographic feature
demo_df <- df %>%
select(c('age','sex','race','education'))#select( -c( 5:22, 24:38 ) )
describe( demo_df )## demo_df
##
## 4 Variables 175 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 175 0 42 0.999 39.47 12.8 22.0 24.0
## .25 .50 .75 .90 .95
## 29.5 42.0 48.0 53.0 56.0
##
## lowest : 18 19 20 21 22, highest: 55 56 57 61 69
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 175 0 2
##
## Value Male Female
## Frequency 99 76
## Proportion 0.566 0.434
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 175 0 4
##
## Value White African American Hispanic
## Frequency 72 100 1
## Proportion 0.411 0.571 0.006
##
## Value Other or Missing Data
## Frequency 2
## Proportion 0.011
## --------------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd .05 .10
## 166 9 14 0.929 11.9 2.265 8.25 9.00
## .25 .50 .75 .90 .95
## 11.00 12.00 13.00 14.00 16.00
##
## lowest : 6 7 8 9 10, highest: 15 16 17 18 19
##
## Value 6 7 8 9 10 11 12 13 14 15 16
## Frequency 2 2 5 12 12 23 67 15 14 1 7
## Proportion 0.012 0.012 0.030 0.072 0.072 0.139 0.404 0.090 0.084 0.006 0.042
##
## Value 17 18 19
## Frequency 2 3 1
## Proportion 0.012 0.018 0.006
## --------------------------------------------------------------------------------
Visualizing the demographic information of the mental health study
p1 <- ggplot( df, aes(x = age) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Age Density Plot' )
num_obs <- dim(df)[1]
p2 <- df %>%
count( sex ) %>%
mutate( n = n/num_obs ) %>%
ggplot( aes(x=sex, y=n)) +
geom_bar(stat = "identity") +
ylim( 0, 0.6 ) +
ylab( 'proportion' ) +
ggtitle( 'Distribution of Sex (binary)' ) +
theme_classic()
p3 <- df %>%
count( race ) %>%
mutate( n = n/num_obs ) %>%
ggplot( aes(x=race, y=n)) +
geom_bar(stat = "identity") +
ylab( 'proportion' ) +
ylim( 0, 0.6 ) +
ggtitle( 'Distribution of Race' ) +
theme_classic() +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
p4 <- ggplot( df, aes(x = education) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Education Density Plot' )
num_obs <- dim(df)[1]
grid.arrange(p1, p4, p2, p3, ncol=2)From these visualization we learn several things about the mental health study participants:
Summary Statistics for self-report surveys
# summary of each field
initial_eda_df <- df %>%
select( c('adhd_total', 'md_total' ) )
describe( initial_eda_df )## initial_eda_df
##
## 2 Variables 175 Observations
## --------------------------------------------------------------------------------
## adhd_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 62 0.999 34.32 19.16 7.0 12.0
## .25 .50 .75 .90 .95
## 21.0 33.0 47.5 55.0 62.3
##
## lowest : 0 1 3 5 6, highest: 65 67 69 71 72
## --------------------------------------------------------------------------------
## md_total
## n missing distinct Info Mean Gmd .05 .10
## 175 0 18 0.995 10.02 5.469 0.7 3.0
## .25 .50 .75 .90 .95
## 6.5 11.0 14.0 16.0 17.0
##
## lowest : 0 1 2 3 4, highest: 13 14 15 16 17
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 9 3 5 6 4 7 10 6 8 12 13
## Proportion 0.051 0.017 0.029 0.034 0.023 0.040 0.057 0.034 0.046 0.069 0.074
##
## Value 11 12 13 14 15 16 17
## Frequency 18 12 13 12 14 12 11
## Proportion 0.103 0.069 0.074 0.069 0.080 0.069 0.063
## --------------------------------------------------------------------------------
Visualizing the adhd_total and md_total features. These two features give a summary score for the questions on an ADHD Self-Reporting Survey and a separate Mood Disorder Questionnaire.
p1 <- ggplot( df, aes(x = adhd_total) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( df, aes(x = md_total) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )adhd_total has a roughly normal distribution with a mean centered at 34.3 and a range of 0 to 72. md_total has an entirely different range from 0 to 17; it’s distribution is more left-skewed than ADHD.
Next we visualize adhd_total ~ md_total to evaluate the relationship between the two total scores
ggplot(df, aes(x=md_total, y=adhd_total)) +
geom_point()+
geom_smooth(method=lm) +
theme_classic() +
ggtitle('ADHD total ~ Mood Disorder total')We can evaluate the fit of a linear model to the data:
fit = lm( adhd_total ~ md_total, df )
summary( fit )##
## Call:
## lm(formula = adhd_total ~ md_total, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.718 -8.672 -1.941 9.163 38.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.5104 2.5173 6.559 0.000000000603804 ***
## md_total 1.7769 0.2265 7.844 0.000000000000432 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.38 on 173 degrees of freedom
## Multiple R-squared: 0.2623, Adjusted R-squared: 0.2581
## F-statistic: 61.53 on 1 and 173 DF, p-value: 0.0000000000004319
par(mfrow = c(2, 2))
p1 <- plot(fit, which=1)
p2 <- plot(fit, which=2)
p3 <- plot(fit, which=3)
p4 <- plot(fit, which=4)par(mfrow = c(1, 1))The fit is statistically significant and the diagnostics suggest that the assumptions for linearity are reasonably met. Therefore, we can say with confidence that for a given point increase in Mood Disorder Score, the average ADHD score increases 1.7769 points.
For an EDA of the individual question responses for both the Mood Disorder and ADHD self-reporting surveys, please see the PCA section.
Summary Statistics for Substance Misuse Features
# summary of each field
sub_df <- df %>%
select( c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'subst_dx' ) )
describe( sub_df )## sub_df
##
## 7 Variables 175 Observations
## --------------------------------------------------------------------------------
## alcohol
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 80 18 7 66
## Proportion 0.468 0.105 0.041 0.386
## --------------------------------------------------------------------------------
## thc
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 116 12 3 40
## Proportion 0.678 0.070 0.018 0.234
## --------------------------------------------------------------------------------
## cocaine
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 101 9 5 56
## Proportion 0.591 0.053 0.029 0.327
## --------------------------------------------------------------------------------
## stimulants
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 160 6 5
## Proportion 0.936 0.035 0.029
## --------------------------------------------------------------------------------
## sedative_hypnotics
## n missing distinct
## 171 4 4
##
## Value No Use Use Abuse Dependence
## Frequency 161 4 1 5
## Proportion 0.942 0.023 0.006 0.029
## --------------------------------------------------------------------------------
## opioids
## n missing distinct
## 171 4 3
##
## Value No Use Use Dependence
## Frequency 146 4 21
## Proportion 0.854 0.023 0.123
## --------------------------------------------------------------------------------
## subst_dx
## n missing distinct
## 152 23 4
##
## Value None One Two Three or more
## Frequency 42 61 35 14
## Proportion 0.276 0.401 0.230 0.092
## --------------------------------------------------------------------------------
The individual substance features (e.g. opioids) describe the severity with which a participants uses or does not use a particular substance. On the other hand, subst_dx counts how many substances a participant has a reported using irrespective of severity of use. The following figure will attempt to find patterns between the individual substances and the summary feature.
none_count <- sum(sub_df$subst_dx == 'None')
one_count <- sum(sub_df$subst_dx == 'One')
two_count <- sum(sub_df$subst_dx == 'Two')
more_count <- sum(sub_df$subst_dx == 'Three or more')
total_count <- dim(sub_df)[1]
plot_df <- sub_df %>%
drop_na() %>%
#mutate_if(is.factor, as.numeric) %>%
gather(var, value, -subst_dx) %>%
group_by(var, value, subst_dx) %>%
dplyr::summarize(count = n(),
.groups = 'drop') %>%
dplyr::mutate(prop = count / case_when(subst_dx == 'None' ~ none_count,
subst_dx == 'One' ~ one_count,
subst_dx == 'Two' ~ two_count,
subst_dx == 'Three or more' ~ more_count))
plot_df$value <- factor(plot_df$value,levels = c("No Use", "Use", "Dependence", "Abuse"))
ggplot(plot_df, aes(x = value, y = count, fill = subst_dx)) +
geom_col(position="stack") +#, position = 'dodge') +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(y = 'count',
x = element_blank(),
title = 'Distributions For Substances') #+ #scale_y_continuous(labels = percent_format(accuracy = 1))From the figure above, we see that for each individual substance, the majority of participants report ‘No use’. However, the instances of ‘Dependence’ are elevated for several substances notably alcohol, cocaine and THC. For these substances it is the case that there is a substantial proportion of participants that also reported a substance abuse diagnosis (subst_dx) of three or more substances. There is another interesting result in this figure: there are many instances where a level of substance use was self-reported for a participant that also reported ‘None’ for substance abuse diagnosis. It is interesting that a participant can simultaneously report to, for example, have a dependence on alcohol yet report ‘None’ for a substance abuse diagnosis. Perhaps more documentation on the dataset can shed light on this.
To re-represent this data, we will generate a new summary feature that sums the reported sevarity of use for each substance included in the self report:
sub_df <- sub_df %>%
mutate_if(is.factor, as.numeric) %>%
mutate( subst_sum = alcohol + thc + cocaine + stimulants + sedative_hypnotics + opioids )We can derive a similar summary variable for the non-substance abuse behaviors reported (e.g. disorderly conduct, court order etc.). Hoever, we exclude abuse because it has a difference and non-binary reporting criteria.
abuse_df <- df %>%
select( c( 'suicide', 'disorderly_conduct', 'hx_of_violence', 'court_order' ) ) %>%
mutate_if(is.factor, as.numeric) %>%
mutate( abuse_sum = suicide + disorderly_conduct + hx_of_violence + court_order )Visualize the distribution of the new summary variables
p1 <- ggplot( sub_df, aes(x = subst_sum) ) +
geom_density( fill="#69b3a2", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'ADHD Self-Reporting Survey Scores' )
p2 <- ggplot( abuse_df, aes(x = abuse_sum) ) +
geom_density( fill="#9D85FC", color="#e9ecef", alpha=0.8 ) +
theme_classic() +
ggtitle( 'Mood Disorder Self-Reporting Survey Scores' )
grid.arrange( p1, p2, ncol=2 )For one last exploratory visualization of the data, we will look at correlations between the self-reporting survey summary features (adhd_total & md_total) and the new summary feature generated here. Obviously, a positive correlation is expected between the two survey totals, because we demonstrated the linear regression earlier.
df_feats <- df %>%
select( c('adhd_total','md_total') ) %>%
mutate( subst_sum = as.integer( sub_df$subst_sum ),
abuse_sum = as.integer( abuse_df$abuse_sum ) ) %>%
drop_na()
corrplot(cor(df_feats), method="number") From the correlation plot above, we do not see a strong relationship between the self-reported survey total scores and the new features we generated to summarize substance abuse and non-substance abuses. However, in upcoming sections we will use machine learning techniques to investigate this relationship further.
In order to find patients of clusters, we first need to determine the number of clusters. Below, we’ll use two common methods, evaluating the sum of squares and silhouette width, in order to decide how many clusters we’ll use.
# exclude names
df3 <- df_raw %>%
select(-initial)
# impute null
preproc <- preProcess(df3, 'bagImpute')
df4 <- predict(preproc, df3)fviz_nbclust(df4, kmeans, method = "wss")Based on the sum of squares for each value of K, we can use 2 clusters as the ‘elbow’ point.
fviz_nbclust(df4, kmeans, method = "silhouette")Based on the average silhouette width of clusters, we can proceed with K = 2.
In this case, both methods have a consensus on two clusters.
Once the clusters are calculated, let’s check the size of each cluster to make sure we’re looking at different groups.
# 25 random starts
cluster_k2 <- kmeans(df4, 2, nstart = 25)
# add cluster number to dataframe
df4$cluster_k2 <- cluster_k2$cluster
# check cluster sizes
table(df4$cluster_k2)##
## 1 2
## 88 87
Using K = 2, we created an almost even split of observations. Cluster 1 has 88 observations and cluster 2 has 87.
# adhd vs suicide
fviz_cluster(cluster_k2, data = df4, c('adhd_total', 'suicide'))The clusters created show a clear delineation for respondents based on adhd_total. This suggests that the clusters are separated based on levels of ADHD.
# mood disorder vs suicide
fviz_cluster(cluster_k2, data = df4, c('md_total', 'suicide'))The results of the k2 cluster show considerable overlap based on md_total. This means that the clusters aren’t separated by mood disorder levels.
Let’s examine relative distributions for each cluster and variable.
# function to plot relative variable distributions for clusters
cluster_plot <- function(start_col, end_col, plot = 'density', legend_position = 'bottom') {
temp_df <- df4[,start_col:end_col] %>%
bind_cols(cluster = factor(df4$cluster_k2)) %>%
gather(var, val, -cluster)
if (plot == 'density') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_density(alpha = 0.3)
} else if (plot == 'histogram') {
p <- ggplot(temp_df, aes(x = val, fill = cluster)) +
geom_histogram(alpha = 0.3)
}
p +
facet_wrap(~var, scales = 'free') +
theme_bw() +
labs(fill = 'Cluster',
x = element_blank(),
y = element_blank()
) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size = 8),
legend.position = legend_position)
}cluster_plot(1,3)cluster_plot(4,22)With clear differences for each question as well as the total, cluster 1 reported higher adhd values relative to cluster 2.
cluster_plot(23,38)Some questions have similiar distributions, but generally overall as well as altogether, cluster 1 is more likely to report higher in mood disorders.
cluster_plot(39,44)cluster_plot(51,53)Cluster 2 has a higher chance of not taking non-substance abuse related drugs and is more likely to take one substance related drug.
In this section we will explore the ADHD data set further using Principal Component Analysis (PCA). PCA is a dimensionality reduction method that can be very useful for detecting patterns in complex, feature-rich data. At it’s core, PCA is a fairly simple linear algebra manipulation, an eigendecomposition. Basically, PCA maps the data onto a new set of components that consist of orthogonal axes (eigenvectors) and magnitudes (eigenvalues). Each successive component is added such that it describes as much remaining variance in the data as possible. From the resulting components we can select a subset that describe most of the variance in a new, lower-dimensional feature space which is more convenient for observing relationships in the data that were previously masked by the data’s complexity.
To approach this analysis, we derive 2 sets of features from the original data. Data features that correspond to 1) self-reported answers for an ADHD survey, 2) self-reported answers for a Mood Disorder survey.
The Adult ADHD Self-Reporting Scale Symptoms Checklist is a survey consisting of 18 questions that all range on an ordinal categorical scale: ‘Never’, ‘Rarely’, ‘Sometimes’, ‘Often’, ‘Very Often’
To prepare the ADHD survey responses we performed the following:
adhd_total which gives a summary score for the survey# preparing the ADHD self-report questionnaire data
adhd_df <- df %>%
select( starts_with("adhd_") ) %>%
select( -'adhd_total' )
# recode all questions from factor to numeric values
adhd_df <- adhd_df %>%
mutate_at(
vars(matches("adhd_q") ),
funs( recode_factor( ., 'Never'=0,'Rarely'=1,'Sometimes'=2,'Often'=3,'Very Often'=4 ) ) ) %>%
mutate_if( is.factor, as.numeric )
paste( 'initial number of rows:', dim( adhd_df )[1] )## [1] "initial number of rows: 175"
Summary of missing responses in the ADHD Self-Reporting Survey:
# return a summary table of the missing data in each column
miss_var_summary( adhd_df )## # A tibble: 18 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 adhd_q5 1 0.571
## 2 adhd_q1 0 0
## 3 adhd_q2 0 0
## 4 adhd_q3 0 0
## 5 adhd_q4 0 0
## 6 adhd_q6 0 0
## 7 adhd_q7 0 0
## 8 adhd_q8 0 0
## 9 adhd_q9 0 0
## 10 adhd_q10 0 0
## 11 adhd_q11 0 0
## 12 adhd_q12 0 0
## 13 adhd_q13 0 0
## 14 adhd_q14 0 0
## 15 adhd_q15 0 0
## 16 adhd_q16 0 0
## 17 adhd_q17 0 0
## 18 adhd_q18 0 0
We see that there is only 1 missing value, so it seems reasonable to just drop it
# remove missing
adhd_dna <- adhd_df %>%
drop_na()
paste( 'after drop_na number of rows:', dim( adhd_dna )[1] )## [1] "after drop_na number of rows: 174"
We can perform PCA easily from scratch and use the intermediate steps to learn more about the relationship of the individual question features to the ADHD total score:
scale() functioncov() function# scale the data to facilitate PCA
adhd_df_scaled <- scale( adhd_dna )
# create a covariance matrix
adhd_df_cov <- cov( adhd_df_scaled )Visualizing the covariance matrix
pheatmap(adhd_df_cov, display_numbers = T, color = colorRampPalette(c('white','red'))(100), cluster_rows = F, cluster_cols = F, fontsize_number = 15)The values off the diagonal of the matrix describe the degree of covariance between the two features. We see that some pairs of questions are more closely related than others. For instance, questions #5 and #13 have the highest covariance:
adhd_q5 - “How often do you fidget or squirm with your hands or feet when you have to sit down for a long time?”adhd_q13 - “How often do you feel restless or fidgety”Perhaps it is no surprise that subjects who report the frequency they fidget when sitting down for a long time report a similar incidence of general fidgeting/squirming.
Another closely covarying pair of questions are #16 and #18
adhd_q16 - “When you’re in a conversation, how often do you find yourself finishing the sentences of the people you are talking to, before they can finish them themselves?”adhd_q18 - “How often do you interrupt others when they are busy?”Again, these are two very similar questions.
It is equally interesting to evaluate the questions with the lowest covariance, questions #3 and #16(see above).
adhd_q3 - “How often do you have problems remembering appointments or obligations?”Indeed, conversation skills and the ability to remember appointments are two (subjectively) different tasks, so it is reasonable to not see a strong relation between the two.
Moving on, the covariance matrix can now be used for an eigendecomposition to find the principal components that describe the most variance in the data
# eigendecomposition of the covariance matrix
eig <- eigen( adhd_df_cov )
explained_variance <- eig$values
components <- eig$vectors
explained_variance## [1] 9.3075284 1.3703108 1.0189437 0.8140854 0.7215297 0.5939121 0.5529787
## [8] 0.5013390 0.4471306 0.4144583 0.3953239 0.3693263 0.3523467 0.2795267
## [15] 0.2734344 0.2160613 0.2060649 0.1656991
Now, by storing the eigen values in a dataframe, we can explore the amount of variance explained by the PCA components. Furthermore, we find the ratio of explained variance by normalizing the cumulative variance of the components
compnum <- 1:length( explained_variance )
exv_cumsum <- cumsum( explained_variance )/length( explained_variance )
pca_res <- data.frame( compnum, explained_variance, exv_cumsum )
pca_res## compnum explained_variance exv_cumsum
## 1 1 9.3075284 0.5170849
## 2 2 1.3703108 0.5932133
## 3 3 1.0189437 0.6498213
## 4 4 0.8140854 0.6950482
## 5 5 0.7215297 0.7351332
## 6 6 0.5939121 0.7681283
## 7 7 0.5529787 0.7988494
## 8 8 0.5013390 0.8267015
## 9 9 0.4471306 0.8515421
## 10 10 0.4144583 0.8745676
## 11 11 0.3953239 0.8965300
## 12 12 0.3693263 0.9170482
## 13 13 0.3523467 0.9366230
## 14 14 0.2795267 0.9521522
## 15 15 0.2734344 0.9673430
## 16 16 0.2160613 0.9793464
## 17 17 0.2060649 0.9907945
## 18 18 0.1656991 1.0000000
Visualizing the cumulative explained variance described by the principal components with a Skree Plot:
ggplot( pca_res, aes( x = compnum, y = exv_cumsum ) ) +
geom_line() +
ylim( c(0,1.1) ) +
scale_x_continuous(breaks = seq(0, length( explained_variance ), by = 1)) +
geom_hline( yintercept = 0.95, linetype = 'dotted', col = 'red') +
annotate("text", x = 2, y = 0.98,
label = expression( "95%" ~ sigma), vjust = -0.5) +
theme_minimal() +
xlab( 'Principal Component Number' ) +
ylab( 'Cummulative Explained Variance' ) +
ggtitle( 'Scree Plot: Explained Variance of Principal Components' )The figure above plots the cumulative explained variance of the ADHD survey questions and total score as a result of the eigendecomposition. We see that the first principal component accounts for 51.7% of the total variance. Adding successive components gradually increases the cumulative explained variance. It takes as many as 14 components to describe 95% of the data’s variance. Therefore, dimensionality reduction by way of PCA only moderately decreases the components necessary to adequately explain the data (with a threshold of 95%).
prcomp()We can determine the principal components of the ADHD data more quickly using the prcomp() function from the stats library.
# use the prcomp function
adhd_dna.pca <- prcomp( adhd_dna, center = TRUE, scale = TRUE )
# print the pca summary
summary( adhd_dna.pca )## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.0508 1.17060 1.00943 0.90227 0.84943 0.7707 0.74363
## Proportion of Variance 0.5171 0.07613 0.05661 0.04523 0.04008 0.0330 0.03072
## Cumulative Proportion 0.5171 0.59321 0.64982 0.69505 0.73513 0.7681 0.79885
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.70805 0.66868 0.64378 0.62875 0.60772 0.59359 0.52870
## Proportion of Variance 0.02785 0.02484 0.02303 0.02196 0.02052 0.01957 0.01553
## Cumulative Proportion 0.82670 0.85154 0.87457 0.89653 0.91705 0.93662 0.95215
## PC15 PC16 PC17 PC18
## Standard deviation 0.52291 0.4648 0.45394 0.40706
## Proportion of Variance 0.01519 0.0120 0.01145 0.00921
## Cumulative Proportion 0.96734 0.9794 0.99079 1.00000
It is interesting to compare the Cumulative Proportion from the prcomp() output with the feature pca_res$exv_cumsum; the results are the same for both the built-in PCA function as well as are PCA from scratch (a relief!).
The PCA object that results from prcomp() can be used to visualize the components using the ggbiplot library:
library( ggbiplot )
ggbiplot( adhd_dna.pca )The biplot shown above renders the data in the feature space of principal components PC1 and PC2; these are the components that explain most of the variance in the data. The scatterplot points show the projection of the data onto PC1 & PC2. The red arrows represent the original feature space that the data was represented in. We can see that the original features have the largest projection onto the PC1 axis. Relatively few original features have a sizeable projection onto PC2 (e.g. adhd_q16 & adhd_q17). Additionally, these vectors all have a positive component along PC1 showing that the result for each question has a positive relation to PC1. The size of the angle between a given pair of vectors is inversely related to the correlation of the pair. For example, we see that adhd_q3 and adhd_q16 have the largest angle between them; this corresponds to our earlier observation looking at the covariance matrix.
The Mood Disorder Questionnaire is a self-reporting survey that can be used to identify subjects that may be bipolar.
We can perform a similar PCA analysis for the Mood Disorder survey results. Taking similar preprocessing steps: 1) select mood disorder question features, evaluate missing values…
# prepare the mood disorder data and recode all questions from factor to numeric values
md_df <- df %>%
select( starts_with("md_") ) %>%
select( -'md_total' ) %>%
mutate_if( is.factor, as.numeric )
# return a summary table of the missing data in each column
miss_var_summary( md_df )## # A tibble: 15 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 md_q1a 0 0
## 2 md_q1b 0 0
## 3 md_q1c 0 0
## 4 md_q1d 0 0
## 5 md_q1e 0 0
## 6 md_q1f 0 0
## 7 md_q1g 0 0
## 8 md_q1h 0 0
## 9 md_q1i 0 0
## 10 md_q1j 0 0
## 11 md_q1k 0 0
## 12 md_q1l 0 0
## 13 md_q1m 0 0
## 14 md_q2 0 0
## 15 md_q3 0 0
Wonderful, there are no missing values, so we can proceed with PCA using prcomp()…
# use the prcomp function
md_df.pca <- prcomp( md_df, center = TRUE, scale = TRUE )
# print the pca summary
summary( md_df.pca )## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3857 1.3474 0.9721 0.94891 0.91563 0.82515 0.79246
## Proportion of Variance 0.3794 0.1210 0.0630 0.06003 0.05589 0.04539 0.04187
## Cumulative Proportion 0.3794 0.5004 0.5635 0.62348 0.67937 0.72476 0.76663
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77044 0.74286 0.72814 0.70302 0.65162 0.5835 0.55685
## Proportion of Variance 0.03957 0.03679 0.03535 0.03295 0.02831 0.0227 0.02067
## Cumulative Proportion 0.80620 0.84299 0.87834 0.91128 0.93959 0.9623 0.98296
## PC15
## Standard deviation 0.50553
## Proportion of Variance 0.01704
## Cumulative Proportion 1.00000
Now to visualize the PCA biplot:
ggbiplot( md_df.pca ) + xlim( -2,2 )A similar pattern emerges for the Mood Disorder survey responses. We see that all the projections again have a major component along PC1. Again, we can infer relationships between features from the biplot: md_q1c and md_q3 have the widest angle separating them and are therefore the least correlated features. where:
md_q1c - Has there been a period of time when (…) you felt much more self-confident than usual
md_q3 - How much of a problem did any of these cause you - being unable to work; having family, money or legal troubles; getting into arguments or fights
Responses to md_q1c with a ‘Yes’ reflect a positive affect with a larger numeric value. Conversely, larger numeric values for md_q3 are associated with a negative aspect. In light of the opposite score polarities, it is not surprising to see a lack of correlation between these two features in the PC1/PC2 space.
On major difference between the Mood Disorder and ADHD biplot is that all the feature vectors point in a negative direction in relation to PC1. This suggests inverse relationship between the features variables and PC1. You may recall from the previous section that the responses to the ADHD survey were all positive in relation to PC1.
Gradient Boosting builds a prediction model from an ensemble of smaller ‘weak learner’ models. In this section we will implement the gradient boosting approach to fit a binary classification model to features from the ADHD data set to predict whether a patient attempts suicide.
Before fitting a Gradient Boost Model, we will collect a new subset of features for this approach. Instead of including all features for individual questions on either of the summaries, we will only be using the total scores (adhd_total and md_total) because the total scores are composite features build from the scores on the questions. Additionally, the categorical responses
ordinal_cats <- c( 'alcohol', 'thc', 'cocaine', 'stimulants', 'sedative_hypnotics', 'opioids', 'suicide',
'court_order', 'hx_of_violence', 'disorderly_conduct', 'non_subst_dx', 'subst_dx')
gboost_df <- df %>%
select( -starts_with("md_q") ) %>%
select( -starts_with("adhd_q")) %>%
select( -c('initial') ) %>%
dplyr::mutate( across( all_of( ordinal_cats ), as.numeric ) )
paste( 'number of rows:', dim( gboost_df )[1] )## [1] "number of rows: 175"
glimpse( gboost_df )## Rows: 175
## Columns: 19
## $ age <int> 24, 48, 51, 43, 34, 39, 41, 48, 44, 27, 44, 56, 53,…
## $ sex <fct> Male, Female, Female, Male, Male, Female, Female, M…
## $ race <fct> White, White, White, White, White, White, White, Wh…
## $ adhd_total <int> 40, 55, 31, 45, 48, 55, 54, 41, 56, 56, 42, 38, 31,…
## $ md_total <int> 15, 14, 5, 13, 7, 14, 9, 7, 12, 11, 16, 0, 11, 10, …
## $ alcohol <dbl> 2, 1, 1, 2, 2, 2, 4, 1, 2, 1, 4, 1, 2, 1, 4, 1, 1, …
## $ thc <dbl> 2, 1, 1, 2, 2, 1, 4, 1, 1, 4, 2, 1, 2, 1, 2, 1, 1, …
## $ cocaine <dbl> 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ stimulants <dbl> 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ sedative_hypnotics <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ opioids <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 4, 1, …
## $ court_order <dbl> 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, …
## $ education <int> 11, 14, 12, 12, 9, 11, 12, 16, 12, 9, 12, 18, 12, 1…
## $ hx_of_violence <dbl> 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, …
## $ disorderly_conduct <dbl> 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, …
## $ suicide <dbl> 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
## $ abuse <fct> "No", "Physical & Sexual", "Sexual & Emotional", "P…
## $ non_subst_dx <dbl> 3, 2, 3, 3, 3, 1, 2, 3, 2, 2, 2, 3, 1, 2, 1, 3, 3, …
## $ subst_dx <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 1, 1, 2, 2, 1, …
Evaluate the presence of missing data
miss_var_summary( gboost_df )## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 subst_dx 23 13.1
## 2 non_subst_dx 22 12.6
## 3 abuse 14 8
## 4 suicide 13 7.43
## 5 hx_of_violence 11 6.29
## 6 disorderly_conduct 11 6.29
## 7 education 9 5.14
## 8 court_order 5 2.86
## 9 alcohol 4 2.29
## 10 thc 4 2.29
## 11 cocaine 4 2.29
## 12 stimulants 4 2.29
## 13 sedative_hypnotics 4 2.29
## 14 opioids 4 2.29
## 15 age 0 0
## 16 sex 0 0
## 17 race 0 0
## 18 adhd_total 0 0
## 19 md_total 0 0
gg_miss_upset( gboost_df )We see that the feature psych_meds is missing many values, so we will drop this feature from further consideration. Additionally, we see that there are approximately two dozen rows that are missing values for multiple features. Therefore, instead of imputing the missing, we will simply drop these rows.
gboost_df <- gboost_df %>%
#select( -psych_meds ) %>%
drop_na()
paste( 'number of rows:', dim( gboost_df )[1] )## [1] "number of rows: 142"
miss_var_summary( gboost_df )## # A tibble: 19 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 age 0 0
## 2 sex 0 0
## 3 race 0 0
## 4 adhd_total 0 0
## 5 md_total 0 0
## 6 alcohol 0 0
## 7 thc 0 0
## 8 cocaine 0 0
## 9 stimulants 0 0
## 10 sedative_hypnotics 0 0
## 11 opioids 0 0
## 12 court_order 0 0
## 13 education 0 0
## 14 hx_of_violence 0 0
## 15 disorderly_conduct 0 0
## 16 suicide 0 0
## 17 abuse 0 0
## 18 non_subst_dx 0 0
## 19 subst_dx 0 0
We will begin by splitting the data into train and test sets
df2 <- df
# train test split
set.seed(101)
trainIndex <- createDataPartition(df2$suicide,
p = 0.75,
list = F)
train <- df2[trainIndex,]
test <- df2[-trainIndex,]
# cross validation train control
ctrl <- trainControl(method = 'cv', number = 10)indexes = createDataPartition(gboost_df$suicide, p = .90, list = F)
train = gboost_df[indexes, ]
test = gboost_df[-indexes, ]# Train model with preprocessing & repeated cv
model_gbm <- caret::train(suicide ~ .,
data = train,
method = "gbm",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
verboseIter = FALSE),
verbose = 0)
print(model_gbm)## Stochastic Gradient Boosting
##
## 128 samples
## 18 predictor
##
## Pre-processing: scaled (28), centered (28)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 115, 115, 116, 115, 115, 115, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 0.4618525 0.11416303 0.4067475
## 1 100 0.4716013 0.10871921 0.4061297
## 1 150 0.4763428 0.11119615 0.4071995
## 2 50 0.4759660 0.09982923 0.4143810
## 2 100 0.4832988 0.10076528 0.4164349
## 2 150 0.4803671 0.11094950 0.4084625
## 3 50 0.4751328 0.09992148 0.4062867
## 3 100 0.4891577 0.09489353 0.4119593
## 3 150 0.4895725 0.10903967 0.4100652
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
#caret::confusionMatrix(
# data = predict(model_gbm, test),
# reference = test$suicide
# )
Social
Cluster 2 is less likely to experience abuse and suicide, and more likely to have a history of violence and be charged with disorderly conduct